# In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Apply seaborn's default theme; color_codes=True remaps matplotlib's
# shorthand color codes (e.g. "b", "g", "r") to the seaborn palette.
sns.set_theme(color_codes=True)
# In [2]:
# Load the dataset from a CSV file located in the working directory.
df = pd.read_csv('kidney-stone-dataset.csv')


1.021 
1.017 
1.008 


1.011 


df.head() 
Out[2]: 
Unnamed: 0 gravity 

0 0 
1 1 
2 2 
3 3 
4 4 


# In [3]:
# 'Unnamed: 0' only repeats the row index (see Out[2]), so it carries no
# information. Reassign the result so the column is really gone from `df`
# and the cell stays idempotent-friendly (no inplace mutation).
df = df.drop(columns='Unnamed: 0')


# In [4]:
df.head()


Out [4]: 
gravity ph 


0 1.021 4.91 
1 1.017 5.74 
2 1.008 7.20 
3 1.011 5.51 
# In [6]:
# One box plot per numeric feature, laid out on a 2x3 grid.
num_vars = ['gravity', 'ph', 'osmo', 'cond', 'urea', 'calc']

fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(20, 10))
axs = axs.flatten()  # 2-D grid of axes -> flat sequence indexable by i

for i, var in enumerate(num_vars):
    sns.boxplot(x=var, data=df, ax=axs[i])

fig.tight_layout()

plt.show()
# In [7]:
# One violin plot per numeric feature, laid out on a 2x3 grid.
num_vars = ['gravity', 'ph', 'osmo', 'cond', 'urea', 'calc']
fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(20, 10))

axs = axs.flatten()  # 2-D grid of axes -> flat sequence indexable by i

for i, var in enumerate(num_vars):
    sns.violinplot(x=var, data=df, ax=axs[i])

fig.tight_layout()

plt.show()
# In [13]:
# One histogram per numeric feature, laid out on a 2x3 grid.
num_vars = ['gravity', 'ph', 'osmo', 'cond', 'urea', 'calc']

fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(20, 10))
axs = axs.flatten()  # 2-D grid of axes -> flat sequence indexable by i

for i, var in enumerate(num_vars):
    sns.histplot(x=var, data=df, ax=axs[i])

fig.tight_layout()

plt.show()
# In [14]:
# Same histograms as before, but split by the `target` column via `hue`.
num_vars = ['gravity', 'ph', 'osmo', 'cond', 'urea', 'calc']

fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(20, 10))
axs = axs.flatten()  # 2-D grid of axes -> flat sequence indexable by i

for i, var in enumerate(num_vars):
    sns.histplot(x=var, data=df, hue='target', ax=axs[i])

fig.tight_layout()

plt.show()
# In [16]:
# Scatter of `calc` (x) against `urea` (y), points coloured by `target`.
fig, ax = plt.subplots(figsize=(8, 8), dpi=100)
sns.scatterplot(
    data=df,
    x="calc",
    y="urea",
    hue="target",
    edgecolor="black",
    ax=ax,
)
plt.show()
# In [17]:
# Scatter of `cond` (x) against `urea` (y), points coloured by `target`.
fig, ax = plt.subplots(figsize=(8, 8), dpi=100)
sns.scatterplot(
    data=df,
    x="cond",
    y="urea",
    hue="target",
    edgecolor="black",
    ax=ax,
)
plt.show()
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Data Preprocessing 


# In [18]:
# Check missing value
# Percentage of null entries per column (null count * 100 / number of rows).
check_missing = df.isnull().sum() * 100 / df.shape[0]
# Show only the columns that have missing values, worst first.
check_missing[check_missing > 0].sort_values(ascending=False)


Out[18]: Series([], dtype: float64) 


Check whether the target value is balanced or not 
# In [19]:
# Name the column through the `x` keyword: passing it positionally raises a
# seaborn FutureWarning and stops working from seaborn 0.12, where the only
# valid positional argument is `data`.
sns.countplot(x='target', data=df)
# Number of rows per class label.
df['target'].value_counts()


D: \anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following varia 
ble as a keyword arg: x. From version 0.12, the only valid positional argument will be “data”, an 


d passing other arguments without an explicit keyword will result in an error or misinterpretatio 
n. 


warnings.warn( 
Out[19]: 0 45 


1 45 
Name: target, dtype: int64 


count 


10 


target 


There's no outlier, so we don't have to remove it 
# In [21]:
# Annotated heatmap of the pairwise correlations between the columns of df.
fig, ax = plt.subplots(figsize=(15, 12))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, fmt=".2g", ax=ax)


out [21]: <AxesSubplot:> 
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Train test split 


# In [22]:
# Feature matrix: every column except the label.
X = df.drop('target', axis=1)
# Label vector.
y = df['target']


# In [23]:
# test size 20% and train size 80%
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# random_state is fixed so the same rows land in train/test on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


Machine Learning Model Building 


Decision Tree 


localhost:8888/notebooks/Kidney Stone Prediction.ipynb 9/22 


4/20/23, 11:22 PM 


In [24]: 


In [25]: 


out [25]: 


In [26]: 


In [27]: 
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# NOTE(review): the estimator used for the search has no random_state, so the
# selected hyperparameters may vary between runs — confirm whether to seed it.
dtree = DecisionTreeClassifier()
param_grid = {
    'max_depth': [3, 4, 5, 6, 7, 8],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3, 4]
}

# Perform a grid search with cross-validation to find the best hyperparameters
grid_search = GridSearchCV(dtree, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print(grid_search.best_params_)


{'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 4} 


from sklearn.tree import DecisionTreeClassifier

# Rebuild the tree with the hyperparameters printed by the grid search
# (max_depth=7, min_samples_leaf=1, min_samples_split=4). random_state=0
# matches the estimator repr recorded in the cell output.
dtree = DecisionTreeClassifier(random_state=0, max_depth=7, min_samples_leaf=1, min_samples_split=4)
dtree.fit(X_train, y_train)


DecisionTreeClassifier(max_depth=7, min_samples_split=4, random_state=0) 


# Predict labels for the held-out rows and report accuracy as a percentage.
y_pred = dtree.predict(X_test)
print("Accuracy Score :", round(accuracy_score(y_test, y_pred)*100, 2), "%")

# Recorded output:
# Accuracy Score : 66.67 %


# log_loss is called below, so it has to be part of this import.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, jaccard_score, log_loss

# All averaged metrics use micro averaging.
print('F-1 Score : ', (f1_score(y_test, y_pred, average='micro')))
print('Precision Score : ', (precision_score(y_test, y_pred, average='micro')))
print('Recall Score : ', (recall_score(y_test, y_pred, average='micro')))
print('Jaccard Score : ', (jaccard_score(y_test, y_pred, average='micro')))

# NOTE(review): log_loss receives hard class predictions here rather than
# predict_proba output — confirm this is the intended measurement.
print('Log Loss : ', (log_loss(y_test, y_pred)))


F-1 Score : 0.6666666666666666 
Precision Score : 0.6666666666666666 
Recall Score : 0.6666666666666666 
Jaccard Score : 0.5 

Log Loss : 11.513014309129138 
# In [28]:
# Pair each training column with the importance the fitted tree assigned to it.
imp_df = pd.DataFrame({
    "Feature Name": X_train.columns,
    "Importance": dtree.feature_importances_
})

# Most important first; keep at most the ten largest for the chart.
fi = imp_df.sort_values(by="Importance", ascending=False)
fi2 = fi.head(10)

plt.figure(figsize=(10, 8))
sns.barplot(data=fi2, x='Importance', y='Feature Name')
plt.title('Top 10 Feature Importance Each Attributes (Decision Tree)', fontsize=18)
plt.xlabel('Importance', fontsize=16)
plt.ylabel('Feature Name', fontsize=16)
plt.show()


Top 10 Feature Importance Each Attributes (Decision Tree) 
# In [29]:
# compute SHAP values
import shap

explainer = shap.TreeExplainer(dtree)
shap_values = explainer.shap_values(X_test)
# Index 1 picks the second entry of shap_values — assumes shap returns one
# array per class for this classifier; TODO confirm for the installed version.
shap.summary_plot(shap_values[1], X_test.values, feature_names=X_test.columns)
# In [47]:
from sklearn.metrics import confusion_matrix

# Uses whatever `y_pred` currently holds; run right after the matching
# predict cell so it describes the decision tree.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(5, 5))
sns.heatmap(data=cm, linewidths=.5, annot=True, cmap='Blues')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')

# `{0}` is the str.format placeholder that receives the accuracy value.
all_sample_title = 'Accuracy Score for Decision Tree: {0}'.format(dtree.score(X_test, y_test))
plt.title(all_sample_title, size=15)


Out[47]: Text(0.5, 1.0, 'Accuracy Score for Decision Tree: 0.6666666666666666') 


Accuracy Score for Decision Tree: 0.6666666666666666 
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In [48]: 


out [48]: 
from sklearn.metrics import roc_curve, roc_auc_score

# Second column of predict_proba: the probability of the second class
# (presumably target == 1 — verify against dtree.classes_).
y_pred_proba = dtree.predict_proba(X_test)[:, 1]

# Side-by-side frame of true labels and predicted probabilities, re-indexed
# to the original row labels of the test set.
df_actual_predicted = pd.concat(
    [
        pd.DataFrame(np.array(y_test), columns=['y_actual']),
        pd.DataFrame(y_pred_proba, columns=['y_pred_proba']),
    ],
    axis=1,
)
df_actual_predicted.index = y_test.index

fpr, tpr, tr = roc_curve(df_actual_predicted['y_actual'], df_actual_predicted['y_pred_proba'])
auc = roc_auc_score(df_actual_predicted['y_actual'], df_actual_predicted['y_pred_proba'])

plt.plot(fpr, tpr, label='AUC = %0.4f' % auc)
# Diagonal reference line (fpr against itself).
plt.plot(fpr, fpr, linestyle='--', color='k')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve', size=15)
plt.legend()


<matplotlib.legend.Legend at 0x1aecbd067f0> 


ROC Curve 
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In [49]: 


Ti [54]: 


out [51]: 


In [52]: 


En: [53]: 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# NOTE(review): the estimator used for the search has no random_state, so the
# selected hyperparameters may vary between runs — confirm whether to seed it.
rfc = RandomForestClassifier()
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 5, 10],
    'max_features': ['sqrt', 'log2', None]
}

# Perform a grid search with cross-validation to find the best hyperparameters
grid_search = GridSearchCV(rfc, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print(grid_search.best_params_)


{'max_depth': None, 'max_features': None, 'n_estimators': 200} 


from sklearn.ensemble import RandomForestClassifier

# NOTE(review): only random_state is set; the hyperparameters printed by the
# grid search are not passed to this model — confirm that is intended.
rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train, y_train)


RandomForestClassifier(random_state=0) 


# Predict labels for the held-out rows and report accuracy as a percentage.
y_pred = rfc.predict(X_test)
print("Accuracy Score :", round(accuracy_score(y_test, y_pred)*100, 2), "%")

# Recorded output:
# Accuracy Score : 88.89 %


# log_loss is called below, so it has to be part of this import.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, jaccard_score, log_loss

# All averaged metrics use micro averaging.
print('F-1 Score : ', (f1_score(y_test, y_pred, average='micro')))
print('Precision Score : ', (precision_score(y_test, y_pred, average='micro')))
print('Recall Score : ', (recall_score(y_test, y_pred, average='micro')))
print('Jaccard Score : ', (jaccard_score(y_test, y_pred, average='micro')))

# NOTE(review): log_loss receives hard class predictions here rather than
# predict_proba output — confirm this is the intended measurement.
print('Log Loss : ', (log_loss(y_test, y_pred)))


F-1 Score : 0.8888888888888888 
Precision Score : 0.8888888888888888 
Recall Score : 0.8888888888888888 
Jaccard Score : 0.8 

Log Loss : 3.837686243736199 
# In [54]:
# Pair each training column with the importance the fitted forest assigned to it.
imp_df = pd.DataFrame({
    "Feature Name": X_train.columns,
    "Importance": rfc.feature_importances_
})

# Most important first; keep at most the ten largest so the chart matches
# its "Top 10" title.
fi = imp_df.sort_values(by="Importance", ascending=False)
fi2 = fi.head(10)

plt.figure(figsize=(10, 8))
sns.barplot(data=fi2, x='Importance', y='Feature Name')
plt.title('Top 10 Feature Importance Each Attributes (Random Forest)', fontsize=18)
plt.xlabel('Importance', fontsize=16)
plt.ylabel('Feature Name', fontsize=16)
plt.show()


Top 10 Feature Importance Each Attributes (Random Forest) 
# In [55]:
# compute SHAP values
import shap

explainer = shap.TreeExplainer(rfc)
shap_values = explainer.shap_values(X_test)

# Index 1 picks the second entry of shap_values — assumes shap returns one
# array per class for this classifier; TODO confirm for the installed version.
shap.summary_plot(shap_values[1], X_test.values, feature_names=X_test.columns)
# In [57]:
from sklearn.metrics import confusion_matrix

# Uses whatever `y_pred` currently holds; run right after the matching
# predict cell so it describes the random forest.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(5, 5))
sns.heatmap(data=cm, linewidths=.5, annot=True, cmap='Blues')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
# `{0}` is the str.format placeholder that receives the accuracy value.
all_sample_title = 'Accuracy Score for Random Forest: {0}'.format(rfc.score(X_test, y_test))
plt.title(all_sample_title, size=15)


Out[57]: Text(0.5, 1.0, 'Accuracy Score for Random Forest: 0.8888888888888888') 


Accuracy Score for Random Forest: 0.8888888888888888 
# In [58]:
from sklearn.metrics import roc_curve, roc_auc_score

# Second column of predict_proba: the probability of the second class
# (presumably target == 1 — verify against rfc.classes_).
y_pred_proba = rfc.predict_proba(X_test)[:, 1]

# Side-by-side frame of true labels and predicted probabilities, re-indexed
# to the original row labels of the test set.
df_actual_predicted = pd.concat(
    [
        pd.DataFrame(np.array(y_test), columns=['y_actual']),
        pd.DataFrame(y_pred_proba, columns=['y_pred_proba']),
    ],
    axis=1,
)
df_actual_predicted.index = y_test.index

fpr, tpr, tr = roc_curve(df_actual_predicted['y_actual'], df_actual_predicted['y_pred_proba'])
auc = roc_auc_score(df_actual_predicted['y_actual'], df_actual_predicted['y_pred_proba'])

plt.plot(fpr, tpr, label='AUC = %0.4f' % auc)
# Diagonal reference line (fpr against itself).
plt.plot(fpr, fpr, linestyle='--', color='k')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve', size=15)
plt.legend()


Out[58]: <matplotlib.legend.Legend at 0x1aecca7cfd0> 
# In [61]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Create a Logistic Regression model
logreg = LogisticRegression(solver='liblinear', max_iter=10000)

# Define the parameter grid
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10]
}

# Perform a grid search with cross-validation to find the best hyperparameters
grid_search = GridSearchCV(logreg, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print(grid_search.best_params_)


{'C': 0.1, 'penalty': 'l1'} 


# In [62]:
from sklearn.ensemble import RandomForestClassifier
# Import the class this cell actually instantiates so the cell does not
# depend on an earlier cell's import.
from sklearn.linear_model import LogisticRegression

# Rebuild the model with the hyperparameters printed by the grid search
# (C=0.1, L1 penalty).
logreg = LogisticRegression(solver='liblinear', max_iter=10000, C=0.1, penalty='l1')
logreg.fit(X_train, y_train)


Out[62]: LogisticRegression(C=0.1, max_iter=10000, penalty='l1', solver='liblinear') 


# In [63]:
# Predict labels for the held-out rows and report accuracy as a percentage.
y_pred = logreg.predict(X_test)
print("Accuracy Score :", round(accuracy_score(y_test, y_pred)*100, 2), "%")


Accuracy Score : 66.67 % 


# In [65]:
# log_loss is called below, so it has to be part of this import.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, jaccard_score, log_loss

# All averaged metrics use micro averaging.
print('F-1 Score : ', (f1_score(y_test, y_pred, average='micro')))
print('Precision Score : ', (precision_score(y_test, y_pred, average='micro')))
print('Recall Score : ', (recall_score(y_test, y_pred, average='micro')))
print('Jaccard Score : ', (jaccard_score(y_test, y_pred, average='micro')))
# NOTE(review): log_loss receives hard class predictions here rather than
# predict_proba output — confirm this is the intended measurement.
print('Log Loss : ', (log_loss(y_test, y_pred)))


F-1 Score : 0.6666666666666666 
Precision Score : 0.6666666666666666 
Recall Score : 0.6666666666666666 
Jaccard Score : 0.5 

Log Loss : 11.513058731208593 
# In [67]:
from sklearn.metrics import confusion_matrix

# Uses whatever `y_pred` currently holds; run right after the matching
# predict cell so it describes the logistic regression.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(5, 5))
sns.heatmap(data=cm, linewidths=.5, annot=True, cmap='Blues')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
# `{0}` is the str.format placeholder that receives the accuracy value.
all_sample_title = 'Accuracy Score for Logistic Regression: {0}'.format(logreg.score(X_test, y_test))
plt.title(all_sample_title, size=15)


Out[67]: Text(0.5, 1.0, 'Accuracy Score for Logistic Regression: 0.6666666666666666') 


Accuracy Score for Logistic Regression: 0.6666666666666666 
# In [68]:
from sklearn.metrics import roc_curve, roc_auc_score

# Second column of predict_proba: the probability of the second class
# (presumably target == 1 — verify against logreg.classes_).
y_pred_proba = logreg.predict_proba(X_test)[:, 1]

# Side-by-side frame of true labels and predicted probabilities, re-indexed
# to the original row labels of the test set.
df_actual_predicted = pd.concat(
    [
        pd.DataFrame(np.array(y_test), columns=['y_actual']),
        pd.DataFrame(y_pred_proba, columns=['y_pred_proba']),
    ],
    axis=1,
)
df_actual_predicted.index = y_test.index

fpr, tpr, tr = roc_curve(df_actual_predicted['y_actual'], df_actual_predicted['y_pred_proba'])
auc = roc_auc_score(df_actual_predicted['y_actual'], df_actual_predicted['y_pred_proba'])

plt.plot(fpr, tpr, label='AUC = %0.4f' % auc)
# Diagonal reference line (fpr against itself).
plt.plot(fpr, fpr, linestyle='--', color='k')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve', size=15)
plt.legend()


Out[68]: <matplotlib.legend.Legend at 0x1aecc934190> 
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